In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
In [2]:
sns.set(style='whitegrid')
In [3]:
products=pd.read_csv(r"../data/products_clean.csv")
In [4]:
df=pd.DataFrame(products)
TOP 5 HIGHEST RATED PRODUCTS
In [5]:
top_rated=df.sort_values(by='rating_rate',ascending=False).head(5)
print("Top 5 highest rated products are:")
print(top_rated[['title','rating_rate']])
Top 5 highest rated products are:
title rating_rate
10 Silicon Power 256GB SSD 3D NAND A55 SLC Cache ... 4.8
11 WD 4TB Gaming Drive Works with Playstation 4 P... 4.8
2 Mens Cotton Jacket 4.7
17 MBJ Women's Solid Short Sleeve Boat Neck V 4.7
4 John Hardy Women's Legends Naga Gold & Silver ... 4.6
TOP 5 MOST RATED PRODUCTS
In [6]:
top_most_rated=df.sort_values(by='rating_count',ascending=False).head(5)
print("Top 5 most rated products are:")
print(top_most_rated[['title','rating_count']])
Top 5 most rated products are:
title rating_count
16 Rain Jacket Women Windbreaker Striped Climbing... 679
2 Mens Cotton Jacket 500
9 SanDisk SSD PLUS 1TB Internal SSD - SATA III 6... 470
3 Mens Casual Slim Fit 430
4 John Hardy Women's Legends Naga Gold & Silver ... 400
5 MOST EXPENSIVE PRODUCTS AND 5 MOST CHEAPTEST PRODUCTS
In [7]:
most_expensive=df.sort_values(by='price',ascending=False).head(5)
cheaptest=df.sort_values(by='price',ascending=True).head(5)
print("Top 5 most expensive products are:")
print(most_expensive[['title','price']])
print("\nTop 5 cheapest products are:")
print(top_rated[['title','price']])
Top 5 most expensive products are:
title price
13 Samsung 49-Inch CHG90 144Hz Curved Gaming Moni... 999.99
4 John Hardy Women's Legends Naga Gold & Silver ... 695.00
12 Acer SB220Q bi 21.5 inches Full HD (1920 x 108... 599.00
5 Solid Gold Petite Micropave 168.00
11 WD 4TB Gaming Drive Works with Playstation 4 P... 114.00
Top 5 cheapest products are:
title price
10 Silicon Power 256GB SSD 3D NAND A55 SLC Cache ... 109.00
11 WD 4TB Gaming Drive Works with Playstation 4 P... 114.00
2 Mens Cotton Jacket 55.99
17 MBJ Women's Solid Short Sleeve Boat Neck V 9.85
4 John Hardy Women's Legends Naga Gold & Silver ... 695.00
NUMBER OF PRODUCTS PER CATEGORY
In [8]:
category_counts=df['category'].value_counts().reset_index()
category_counts.columns=['category','product_count']
fig=px.treemap(
category_counts,
path=['category'],
values='product_count',
color='product_count',
color_continuous_scale='agsunset',
title='Number of Products in Each Category'
)
fig.show()
fig.write_html("products_per_category.html")
AVERAGE PRICE PER PRODUCT CATEGORY
In [9]:
avg_price=df.groupby('category')['price'].mean().round(2).reset_index()
avg_price.rename(columns={'price':'avg_price'},inplace=True)
print("Average price per category is:")
print(avg_price)
Average price per category is:
category avg_price
0 electronics 332.50
1 jewelery 221.00
2 men's clothing 51.06
3 women's clothing 26.29
In [10]:
fig=px.bar(avg_price,x='category',y='avg_price',text='avg_price',color='category',title='Average Price Per Product Category')
fig.update_traces(texttemplate='%{text}',textposition='outside',hovertemplate='Category:%{x}<br>Avg Price:₹%{y}')
fig.update_layout(
xaxis_title='Product Category',yaxis_title='Average Price',showlegend=False,uniformtext_minsize=8,uniformtext_mode='hide',
)
fig.show()
fig.write_html("avg_price_product.html")
Relationship Between Rating Count And Price(And their ratings)
In [11]:
plt.figure(figsize=(10,6))
scatter=sns.scatterplot(
data=df,
x='price',
y='rating_count',
hue='rating_rate',
palette='coolwarm',
size='rating_count',
sizes=(20,200),
alpha=0.7,
edgecolor='gray'
)
sns.regplot(
data=df,
x='price',
y='rating_count',
scatter=False,
color='blue',
line_kws={"linewidth":2,"linestyle":"dashed"}
)
plt.title('Relationship between Rating Count & Price(colored by rating)')
plt.xlabel('Price')
plt.ylabel('Rating Count')
plt.legend(title='Ratings')
plt.tight_layout()
plt.show()
Interactive Graph for the above
In [12]:
fig = px.scatter(
df,
x='price',
y='rating_count',
color='rating_rate',
size='rating_count',
hover_data=['title', 'rating_rate', 'price', 'rating_count'],
color_continuous_scale='Viridis',
title='💬 Price vs Rating Count (Colored by Rating)'
)
fig.update_traces(marker=dict(opacity=0.7, line=dict(width=1, color='DarkSlateGrey')))
fig.update_layout(
xaxis_title='Price (₹)',
yaxis_title='Rating Count',
coloraxis_colorbar=dict(title=' Rating'),
legend_title='Rating'
)
fig.show()
fig.write_html("price_vs_rating_count.html")
CORRELATION BETWEEN PRICE AND RATING
In [13]:
correlation=df[['price','rating_rate','rating_count']].corr()
print("Correlation Matrix")
print(correlation)
Correlation Matrix
price rating_rate rating_count
price 1.000000 -0.154229 -0.108585
rating_rate -0.154229 1.000000 0.117721
rating_count -0.108585 0.117721 1.000000
Impact of Category and Rating On Price
In [14]:
import sqlite3
# Connect to SQLite database
conn = sqlite3.connect("product_data.db")
# Save DataFrame into SQLite
df.to_sql("products", conn, if_exists="replace", index=False)
# Close connection
conn.close()
In [ ]:
In [15]:
fig=px.scatter(
df,
x='rating_rate',
y='price',
color='rating_count',
facet_col='category',
facet_col_wrap=3,
hover_data=['title','price','rating_rate','rating_count'],
color_continuous_scale='Sunsetdark',
title='Category+Rating Impact on Price',
height=1000
)
fig.update_traces(marker=dict(size=7, opacity=0.7), selector=dict(mode='markers'))
fig.update_layout(coloraxis_colorbar=dict(title='Rating Count'), showlegend=False)
fig.update_xaxes(title='Rating')
fig.update_yaxes(title='Price (₹)')
fig.show()
fig.write_html("category_rating_vs_price.html")
Loading the dataframe to sql to run queries
In [16]:
conn = sqlite3.connect("product_data.db")
query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = conn.execute(query).fetchall()
print("Existing tables:", tables)
conn.close()
Existing tables: [('products',)]
In [17]:
conn = sqlite3.connect("product_data.db")
df_loaded = pd.read_sql("SELECT * FROM products LIMIT 5;", conn)
print(df_loaded)
conn.close()
id title price \
0 1 Fjallraven - Foldsack No. 1 Backpack, Fits 15 ... 109.95
1 2 Mens Casual Premium Slim Fit T-Shirts 22.30
2 3 Mens Cotton Jacket 55.99
3 4 Mens Casual Slim Fit 15.99
4 5 John Hardy Women's Legends Naga Gold & Silver ... 695.00
description category \
0 Your perfect pack for everyday use and walks i... men's clothing
1 Slim-fitting style, contrast raglan long sleev... men's clothing
2 great outerwear jackets for Spring/Autumn/Wint... men's clothing
3 The color could be slightly different between ... men's clothing
4 From our Legends Collection, the Naga was insp... jewelery
rating_rate rating_count
0 3.9 120
1 4.1 259
2 4.7 500
3 2.1 430
4 4.6 400
In [18]:
conn = sqlite3.connect("product_data.db")
# Run SQL query (Modify this as needed)
query = "SELECT id,price FROM products WHERE price > 100 LIMIT 10;"
df_query_result = pd.read_sql(query, conn)
# Show result
print(df_query_result)
conn.close()
id price 0 1 109.95 1 5 695.00 2 6 168.00 3 10 109.00 4 11 109.00 5 12 114.00 6 13 599.00 7 14 999.99